Mapping Open Science Literature

Authors
Affiliations

Isabelle Dorsch

ZBW Leibniz Information Centre for Economics

Madelaine Hare

University of Ottawa

Philippe Mongeon

Dalhousie University

Isabella Peters

ZBW Leibniz Information Centre for Economics and CAU Kiel University

Published

March 18, 2024

Introduction

Research objectives

Data and Methods

Data collection

Code
# Collect every file in ../data/ whose name matches "RData" and load each one,
# so the objects they contain are available to the chunks that follow.
files <- list.files("../data/", pattern = "RData") %>%
  str_c("../data/", .)
for (file in files) {
  load(file)
}

Database design

Code
# Schema of the database, one CREATE TABLE statement per table. Each statement
# uses IF NOT EXISTS, and the statements are listed (and executed) so that a
# table is always created before the tables whose foreign keys reference it.
schema_sql <- c(
  # Lookup table of document types
  doc_types =
"CREATE TABLE IF NOT EXISTS doc_types (
  id SMALLINT,
  doc_type TEXT,
  PRIMARY KEY (id)
  );",

  # Publications (core entity referenced by all link tables)
  works =
"CREATE TABLE IF NOT EXISTS works (
  id TEXT,
  pub_year SMALLINT,
  title TEXT,
  abstract TEXT,
  doi TEXT,
  journal TEXT,
  PRIMARY KEY (id)
  );",

  # Link table: works <-> doc_types
  works_doc_types =
"CREATE TABLE IF NOT EXISTS works_doc_types (
  work_id TEXT,
  doc_type_id SMALLINT,
  PRIMARY KEY (work_id, doc_type_id),
  FOREIGN KEY (work_id) REFERENCES works (id),
  FOREIGN KEY (doc_type_id) REFERENCES doc_types (id)
  );",

  # Lookup table of citation indexes
  citation_indexes =
          "CREATE TABLE IF NOT EXISTS citation_indexes (
            id SMALLINT,
            ci TEXT,
          PRIMARY KEY (id));",

  # Link table: works <-> citation_indexes
  works_index =
          "CREATE TABLE IF NOT EXISTS works_index (
            work_id TEXT,
            citation_index_id SMALLINT,
          PRIMARY KEY (work_id, citation_index_id),
          FOREIGN KEY (work_id) REFERENCES works (id),
          FOREIGN KEY (citation_index_id) REFERENCES citation_indexes (id)
          );",

  # Lookup table of classifications
  classification =
          "CREATE TABLE IF NOT EXISTS classification (
            id SMALLINT,
            classification TEXT,
            PRIMARY KEY (id)
          );",

  # Link table: works <-> classification
  works_classification =
          "CREATE TABLE IF NOT EXISTS works_classification (
            work_id TEXT,
            classification_id SMALLINT,
            PRIMARY KEY (work_id, classification_id),
            FOREIGN KEY (work_id) REFERENCES works (id),
            FOREIGN KEY (classification_id) REFERENCES classification (id)
          );",

  # Lookup table of keywords
  keywords =
"CREATE TABLE IF NOT EXISTS keywords (
  id int,
  keyword text,
  PRIMARY KEY (id)
  );",

  # Link table: works <-> keywords
  works_keywords =
"CREATE TABLE IF NOT EXISTS works_keywords (
  work_id text,
  keyword_id int,
  PRIMARY KEY (work_id, keyword_id),
  FOREIGN KEY (work_id) REFERENCES works (id),
  FOREIGN KEY (keyword_id) REFERENCES keywords (id)
  );",

  # Citation links; both ends reference works
  citations =
"CREATE TABLE IF NOT EXISTS citations (
  item_id_citing text,
  item_id_cited text,
PRIMARY KEY (item_id_citing, item_id_cited),
FOREIGN KEY (item_id_cited) REFERENCES works (id),
FOREIGN KEY (item_id_citing) REFERENCES works (id)
);",

  # One row per cluster, identified by set, network method, clustering
  # method, component and cluster number
  clusters =
"CREATE TABLE IF NOT EXISTS clusters (
  id INT,
  set TEXT,
  network_method TEXT,
  clustering_method TEXT,
  component INT,
  cluster INT,
PRIMARY KEY (id)
);",

  # Link table: works <-> clusters
  works_clusters =
"CREATE TABLE IF NOT EXISTS works_clusters (
  work_id TEXT,
  cluster_id INT,
PRIMARY KEY (work_id, cluster_id),
FOREIGN KEY (work_id) REFERENCES works (id),
FOREIGN KEY (cluster_id) REFERENCES clusters (id)
);"
)

# Run the statements in the order they are listed above.
for (table_name in names(schema_sql)) {
  dbExecute(db, schema_sql[[table_name]])
}

Entity relationship diagram for the open science literature database

Data processing

Works

Code
# Append the works of one publication set to the `works` table, skipping
# ids that are already stored there.
#   publications: data frame with item_id, pubyear, item_title and
#                 source_title columns (renamed below to the table's columns)
#   abstracts:    data frame joined on item_id; presumably the source of the
#                 `abstract` column (`doi` must come from one of the two
#                 inputs) -- confirm against the data export
append_new_works <- function(publications, abstracts) {
  # Only the ids are needed for the duplicate check, so query just that
  # column instead of reading the whole table (titles, abstracts, ...).
  existing_ids <- dbGetQuery(db, "SELECT id FROM works")$id

  new_works <- publications %>%
    left_join(abstracts, by = "item_id") %>%
    select(
      id = item_id,
      pub_year = pubyear,
      title = item_title,
      abstract,
      doi,
      journal = source_title
    ) %>%
    filter(!id %in% existing_ids)

  dbWriteTable(db, "works", new_works, row.names = FALSE, append = TRUE)
}

# -------------------------------------
# Import initial publications set
# -------------------------------------

append_new_works(open_science_init_set, open_science_init_set_abstracts)

# -------------------------------------
# Import extended publications set
# -------------------------------------

# Runs after the initial import, so works already written above are skipped.
append_new_works(
  joint_set_citations_and_references_from_intial_set,
  joint_set_abstracts
)

Keywords

Code
# -------------------------------------
# Unique keywords
# -------------------------------------

# Prepare data
# Stack the (work, keyword-list) pairs of both publication sets, split each
# comma-separated list into one row per keyword, and strip the braces and
# double quotes that surround the values.
works_keywords <- bind_rows(
  open_science_init_set %>%
    select(work_id = item_id, keyword),
  joint_set_citations_and_references_from_intial_set %>%
    select(work_id = item_id, keyword)
) %>%
  separate_rows(keyword, sep = ",") %>%
  mutate(keyword = str_remove_all(keyword, "[{}\"]")) %>%
  unique()

# One row per distinct keyword with a consecutive integer id; the `keywords`
# table declares `id int`, so generate integers rather than character row
# names.
keywords <- works_keywords %>%
  distinct(keyword) %>%
  drop_na() %>%
  mutate(id = row_number()) %>%
  select(id, keyword)

# Replace the keyword text by its id; works without a keyword (NA) have no
# match in `keywords` and are dropped by the inner join.
works_keywords <- works_keywords %>%
  inner_join(keywords, by = "keyword") %>%
  select(work_id, keyword_id = id)

# Write to database
dbWriteTable(db, "keywords",
             keywords,
             row.names = FALSE,
             append = TRUE)

dbWriteTable(db, "works_keywords",
             works_keywords,
             row.names = FALSE,
             append = TRUE)

Citations

Code
works <- dbGetQuery(db, "select id from works")

# Append citation pairs to the `citations` table.
#   pairs: data frame with item_id_citing and item_id_cited columns; an
#          `item_id` column, when present, is dropped before writing.
# Pairs already stored in the table are skipped, and only pairs whose two
# ends are both present in `works` are kept. The table is re-read on every
# call so that pairs written by an earlier call are taken into account.
append_new_citations <- function(pairs) {
  new_pairs <- pairs %>%
    anti_join(dbReadTable(db, "citations"),
              by = c("item_id_citing", "item_id_cited")) %>%
    filter(item_id_cited %in% works$id,
           item_id_citing %in% works$id) %>%
    select(-any_of("item_id")) %>%
    unique() %>%
    drop_na()

  dbWriteTable(db,
               "citations",
               new_pairs,
               row.names = FALSE,
               append = TRUE)
}

# -----------------------------------------------
# Citations to the initial publication set
# -----------------------------------------------

# NOTE(review): unlike the three sets below, this one is written as is,
# without de-duplication or restriction to ids present in `works` --
# confirm that this is intended.
dbWriteTable(db,
             "citations",
             open_science_init_set_citations_ids,
             row.names = FALSE,
             append = TRUE)

# -----------------------------------------------
# References from the initial publication set
# -----------------------------------------------

append_new_citations(open_science_init_set_refs_ids)

# ---------------------------------------------
# Citations to the extended publication set
# ---------------------------------------------

append_new_citations(joint_set_citations_ids)

# -----------------------------------------------
# References from the extended publication set
# -----------------------------------------------

append_new_citations(joint_set_refs_ids)

Clustering

Initial publications set

Network files
Code
citations <- dbReadTable(db, "citations")
load("../data/open_science_init_set.RData")

# Ids of the initial publication set; every network below keeps only edges
# whose two endpoints are both in this set.
core <- open_science_init_set %>%
  select(id = item_id)

initial_net_dir <- "../data/networks_initial_set/"

# Sum the weights of several network files edge by edge and write the result.
#   dir:   folder holding the net_<name>.csv files
#   parts: names of the networks to combine, e.g. c("bc", "cc")
#   out:   name of the combined network, e.g. "bc_cc"
# Edges are matched on the exact (source, target) pair, so a directed edge
# b -> a is not merged with an undirected edge stored as (a, b).
combine_networks <- function(dir, parts, out) {
  lapply(str_c(dir, "net_", parts, ".csv"), read_csv, show_col_types = FALSE) %>%
    bind_rows() %>%
    group_by(source, target) %>%
    summarize(weight = sum(weight), .groups = "drop") %>%
    write_csv(str_c(dir, "net_", out, ".csv"))
}

# -----------------------------
# Produce network files
# -----------------------------

# BC (bibliographic coupling): two citing works are linked once per cited
# work they share. Restricting to core citing works *before* the self-join
# gives the same edges as filtering afterwards, but keeps the join small.
citing_core <- citations %>%
  filter(item_id_citing %in% core$id)

citing_core %>%
  inner_join(citing_core, by = "item_id_cited",
             relationship = "many-to-many") %>%
  select(source = item_id_citing.x, target = item_id_citing.y) %>%
  filter(source < target) %>%  # keep each unordered pair once, no self-pairs
  count(source, target, name = "weight") %>%
  mutate(type = "undirected") %>%
  write_csv(str_c(initial_net_dir, "net_bc.csv"))

# DC (direct citation): one directed edge of weight 1 per citation link
# between two core works.
citations %>%
  rename(source = item_id_citing, target = item_id_cited) %>%
  filter(source %in% core$id, target %in% core$id) %>%
  mutate(weight = 1,
         type = "directed") %>%
  unique() %>%
  write_csv(str_c(initial_net_dir, "net_dc.csv"))

# CC (co-citation): two cited works are linked once per citing work that
# cites both. Same pre-filtering as for BC, on the cited side.
cited_core <- citations %>%
  filter(item_id_cited %in% core$id)

cited_core %>%
  inner_join(cited_core, by = "item_id_citing",
             relationship = "many-to-many") %>%
  select(source = item_id_cited.x, target = item_id_cited.y) %>%
  filter(source < target) %>%
  count(source, target, name = "weight") %>%
  mutate(type = "undirected") %>%
  write_csv(str_c(initial_net_dir, "net_cc.csv"))

# Combined networks, built from the three files written above
combine_networks(initial_net_dir, c("bc", "cc", "dc"), "bc_cc_dc")
combine_networks(initial_net_dir, c("bc", "cc"), "bc_cc")
combine_networks(initial_net_dir, c("bc", "dc"), "bc_dc")
combine_networks(initial_net_dir, c("cc", "dc"), "cc_dc")
Nodes
Code
## nodes files (with clusters) ----

# For every net_*.csv edge list: build an undirected graph, label each node
# with its connected component and with its Louvain and Leiden cluster, and
# write the node table next to the edge list as nodes_*.csv.
net_files <- tibble(file = list.files("../data/networks_initial_set/")) %>%
  filter(str_starts(file, "net_"))

for (file in net_files$file) {

  network <- graph_from_data_frame(
    read.csv(str_c("../data/networks_initial_set/", file)),
    directed = FALSE
  )
  V(network)$comp <- components(network)$membership
  # network <- induced_subgraph(network, V(network)$comp==1)

  # The edge lists store the weight in a lower-case `weight` column; attribute
  # names are case-sensitive, so `E(network)$Weight` is always NULL and the
  # weights were only picked up through igraph's fallback to the `weight`
  # attribute. Pass them explicitly instead.
  edge_weights <- E(network)$weight
  V(network)$cluster_louvain <-
    cluster_louvain(network, weights = edge_weights)$membership
  V(network)$cluster_leiden <-
    cluster_leiden(network, weights = edge_weights)$membership
  #  V(network)$degree <- degree(network)
  #  V(network)$closeness <- closeness(network)
  #  V(network)$eigen_centrality <- eigen_centrality(network)$vector

  # Namespace-qualified: an unqualified as_data_frame() resolves to the
  # dplyr/tibble function of the same name if those packages were attached
  # after igraph.
  igraph::as_data_frame(network, what = "vertices") %>%
    select(id = name,
           component = comp,
           cluster_louvain,
           cluster_leiden) %>%
    write_csv(str_c("../data/networks_initial_set/",
                    str_replace(file, "net", "nodes")))
}
Write to DB
Code
# Write the Louvain clusters of the initial publication set, and the
# work-to-cluster assignments, to the database.

initial_net_dir <- "../data/networks_initial_set/"

# Networks to process; this order determines the order of the rows in
# `clusters` and therefore the ids they receive.
network_methods <- c("bc", "cc", "dc", "bc_cc", "bc_dc", "cc_dc", "bc_cc_dc")

# Highest cluster id already stored, or 0 when the table is empty
# (max() over an empty table comes back as NA).
cluster_id_offset <- function() {
  max_id <- dbGetQuery(db, "select max(id) as max_id from clusters")$max_id
  if (length(max_id) == 0 || is.na(max_id)) 0 else as.numeric(max_id)
}

# One row per (network, component, cluster), read from the per-network sheets
# of the cluster description workbook.
clusters <- lapply(network_methods, function(method) {
  read_xlsx(str_c(initial_net_dir, "clusters_louvain.xlsx"),
            sheet = str_c("nodes_", method)) %>%
    mutate(network_method = method,
           cluster_method = "louvain") %>%
    select(network_method,
           cluster_method,
           component,
           cluster)
}) %>%
  bind_rows() %>%
  unique() %>%
  rename(clustering_method = cluster_method) %>%
  # Numeric ids that continue after the ids already in the table; on an empty
  # table they start at 1.
  mutate(id = row_number() + cluster_id_offset()) %>%
  select(id, everything()) %>%
  mutate(set = "core")

dbWriteTable(db, "clusters", clusters, row.names = FALSE, append = TRUE)

# Node files give the Louvain cluster of every work; the inner join maps each
# (network, component, cluster) to the id assigned above and drops works
# whose cluster is not listed in the workbook.
works_clusters <- lapply(network_methods, function(method) {
  read.csv(str_c(initial_net_dir, "nodes_", method, ".csv")) %>%
    mutate(network_method = method) %>%
    select(work_id = id, network_method, component, cluster = cluster_louvain)
}) %>%
  bind_rows() %>%
  unique() %>%
  inner_join(clusters %>%
               select(id, network_method, component, cluster),
             by = c("network_method", "component", "cluster")) %>%
  select(work_id, cluster_id = id)

dbWriteTable(db, "works_clusters", works_clusters,
             row.names = FALSE, append = TRUE)
Code
# Write the Leiden clusters of the initial publication set, and the
# work-to-cluster assignments, to the database.

initial_net_dir <- "../data/networks_initial_set/"

# Networks to process; this order determines the order of the rows in
# `clusters` and therefore the ids they receive.
network_methods <- c("bc", "cc", "dc", "bc_cc", "bc_dc", "cc_dc", "bc_cc_dc")

# Highest cluster id already stored, or 0 when the table is empty. Without
# this guard max() over an empty table is NA and every new id becomes NA.
cluster_id_offset <- function() {
  max_id <- dbGetQuery(db, "select max(id) as max_id from clusters")$max_id
  if (length(max_id) == 0 || is.na(max_id)) 0 else as.numeric(max_id)
}

# One row per (network, component, cluster), read from the per-network sheets
# of the cluster description workbook.
clusters <- lapply(network_methods, function(method) {
  read_xlsx(str_c(initial_net_dir, "clusters_leiden.xlsx"),
            sheet = str_c("nodes_", method)) %>%
    mutate(network_method = method,
           cluster_method = "leiden") %>%
    select(network_method,
           cluster_method,
           component,
           cluster)
}) %>%
  bind_rows() %>%
  unique() %>%
  rename(clustering_method = cluster_method) %>%
  # Ids continue after the highest id already present in the table.
  mutate(id = row_number() + cluster_id_offset()) %>%
  select(id, everything()) %>%
  mutate(set = "core")

dbWriteTable(db, "clusters", clusters, row.names = FALSE, append = TRUE)

# Node files give the Leiden cluster of every work; the inner join maps each
# (network, component, cluster) to the id assigned above and drops works
# whose cluster is not listed in the workbook.
works_clusters <- lapply(network_methods, function(method) {
  read.csv(str_c(initial_net_dir, "nodes_", method, ".csv")) %>%
    mutate(network_method = method) %>%
    select(work_id = id, network_method, component, cluster = cluster_leiden)
}) %>%
  bind_rows() %>%
  unique() %>%
  inner_join(clusters %>%
               select(id, network_method, component, cluster),
             by = c("network_method", "component", "cluster")) %>%
  select(work_id, cluster_id = id)

dbWriteTable(db, "works_clusters", works_clusters,
             row.names = FALSE, append = TRUE)
Cluster descriptions (Louvain)
Code
## List of nodes files ----

node_files <- tibble(file = list.files("../data/networks_initial_set/")) %>%
  filter(str_starts(file, "nodes_")) %>%
  mutate(file = str_c("../data/networks_initial_set/", file))

# One node table per network, with the Louvain cluster as `cluster`.
net <- lapply(node_files$file, function(path) {
  read_csv(path, show_col_types = FALSE) %>%
    rename(cluster = cluster_louvain)
})
names(net) <- str_remove(str_remove(node_files$file, "../data/networks_initial_set/"), "\\.csv")

## Lookups shared by all networks ----
# None of these depend on the network being described, so they are loaded
# and queried once here rather than once per loop iteration.

### Ids of the core (initial set) papers
load("../data/open_science_init_set.RData")
core_ids <- open_science_init_set %>%
  select(id = item_id)
rm(open_science_init_set)

### Journal of each work
journal_lookup <- dbGetQuery(db, "SELECT DISTINCT id, journal FROM works") %>%
  rename(term = journal)

### Lemmatized keywords of each work
keyword_lookup <- dbGetQuery(db,
                          "SELECT DISTINCT a.id, c.keyword 
                          FROM works a
                          JOIN works_keywords b on b.work_id = a.id
                          JOIN keywords c on c.id = b.keyword_id") %>%
  mutate(keyword = lemmatize_strings(keyword)) %>%
  rename(term = keyword)

### Bibliographic details and citation count of each cited work
citation_lookup <- dbGetQuery(db,
                          "
                          SELECT DISTINCT 
                            a.id, 
                            a.title, 
                            a.pub_year, 
                            a.journal, 
                            count(distinct b.item_id_citing) as cited_by_count
                          FROM works a
                          JOIN citations b on b.item_id_cited = a.id
                          GROUP BY a.id, a.title, a.pub_year, a.journal")

## Helpers ----

# Most characteristic terms (journals or keywords) of each cluster.
#   nodes:  node table with `id` and `cluster` columns
#   lookup: data frame with `id` and `term` columns
#   top_n:  number of terms kept per cluster
# For every (cluster, term) pair the specialization index `si` is the term's
# share of the cluster's rows divided by its share of all rows. Returns one
# row per cluster, with the top_n terms by si pasted into `term`.
top_terms_by_cluster <- function(nodes, lookup, top_n = 10) {
  nodes %>%
    inner_join(lookup, by = "id") %>%
    mutate(total = n()) %>%
    group_by(term) %>%
    mutate(pct_papers = n() / total) %>%
    group_by(cluster) %>%
    mutate(cluster_total = n()) %>%
    group_by(cluster, term) %>%
    mutate(si = (n() / cluster_total) / pct_papers) %>%
    ungroup() %>%
    distinct(cluster, term, si) %>%
    arrange(desc(si)) %>%
    group_by(cluster) %>%
    slice_head(n = top_n) %>%
    summarize(term = paste(term, collapse = "; "))
}

# Most cited papers of each cluster, formatted as "title. (year ). journal".
# Returns one row per cluster, with the top_n papers pasted into `papers`.
top_cited_by_cluster <- function(nodes, cited_counts, top_n = 10) {
  nodes %>%
    inner_join(cited_counts, by = "id") %>%
    mutate(papers = str_c(title, ". (", pub_year, " )", ". ", journal)) %>%
    arrange(desc(cited_by_count)) %>%
    group_by(cluster) %>%
    slice_head(n = top_n) %>%
    summarize(papers = paste(papers, collapse = "; "))
}

## Cluster descriptions ----

# One description table per network; the list names become the sheet names
# of the workbook written below.
clusters <- vector("list", length(net))
names(clusters) <- names(net)

for (i in seq_along(net)) {
  nodes <- net[[i]]

  ### Component ----
  cluster_component <- nodes %>%
    distinct(cluster, component)

  ### Size ----
  cluster_size <- nodes %>%
    count(cluster)

  ### Number of core papers ----
  cluster_core_papers <- nodes %>%
    inner_join(core_ids, by = "id") %>%
    count(cluster, name = "n_core")

  ### Journals ----
  clusters_journals <- top_terms_by_cluster(nodes, journal_lookup) %>%
    rename(journal = term)

  ### keywords
  clusters_keywords <- top_terms_by_cluster(nodes, keyword_lookup) %>%
    rename(keyword = term)

  ### Most cited papers ----
  cluster_citations <- top_cited_by_cluster(nodes, citation_lookup)

  ### putting it all together----
  # Inner joins: a cluster missing from any of the tables above (for example
  # one without keywords) is left out of the description.
  clusters[[i]] <- cluster_component %>%
    inner_join(cluster_size, by = "cluster") %>%
    inner_join(cluster_core_papers, by = "cluster") %>%
    inner_join(clusters_journals, by = "cluster") %>%
    inner_join(clusters_keywords, by = "cluster") %>%
    inner_join(cluster_citations, by = "cluster") %>%
    rename("Number of publications" = n,
           "Number of core publications" = n_core,
           "Top journals" = journal,
           "Top keywords" = keyword,
           "Top cited papers" = papers)
  #   writexl::write_xlsx(str_c("data/networks_initial_set/cluster_table_net_",names(net)[i],".xlsx"))
}
writexl::write_xlsx(clusters, "../data/networks_initial_set/clusters_louvain.xlsx")
Cluster descriptions (Leiden)
Code
## List of nodes files ----

node_files <- tibble(file = list.files("../data/networks_initial_set/")) %>%
  filter(str_starts(file, "nodes_")) %>%
  mutate(file = str_c("../data/networks_initial_set/", file))

# One node table per network, with the Leiden cluster as `cluster`.
net <- lapply(node_files$file, function(path) {
  read_csv(path, show_col_types = FALSE) %>%
    rename(cluster = cluster_leiden)
})
names(net) <- str_remove(str_remove(node_files$file, "../data/networks_initial_set/"), "\\.csv")

## Lookups shared by all networks ----
# None of these depend on the network being described, so they are loaded
# and queried once here rather than once per loop iteration.

### Ids of the core (initial set) papers
load("../data/open_science_init_set.RData")
core_ids <- open_science_init_set %>%
  select(id = item_id)
rm(open_science_init_set)

### Journal of each work
journal_lookup <- dbGetQuery(db, "SELECT DISTINCT id, journal FROM works") %>%
  rename(term = journal)

### Lemmatized keywords of each work
keyword_lookup <- dbGetQuery(db,
                          "SELECT DISTINCT a.id, c.keyword 
                          FROM works a
                          JOIN works_keywords b on b.work_id = a.id
                          JOIN keywords c on c.id = b.keyword_id") %>%
  mutate(keyword = lemmatize_strings(keyword)) %>%
  rename(term = keyword)

### Bibliographic details and citation count of each cited work
citation_lookup <- dbGetQuery(db,
                          "
                          SELECT DISTINCT 
                            a.id, 
                            a.title, 
                            a.pub_year, 
                            a.journal, 
                            count(distinct b.item_id_citing) as cited_by_count
                          FROM works a
                          JOIN citations b on b.item_id_cited = a.id
                          GROUP BY a.id, a.title, a.pub_year, a.journal")

## Helpers ----

# Most characteristic terms (journals or keywords) of each cluster.
#   nodes:  node table with `id` and `cluster` columns
#   lookup: data frame with `id` and `term` columns
#   top_n:  number of terms kept per cluster
# For every (cluster, term) pair the specialization index `si` is the term's
# share of the cluster's rows divided by its share of all rows. Returns one
# row per cluster, with the top_n terms by si pasted into `term`.
top_terms_by_cluster <- function(nodes, lookup, top_n = 10) {
  nodes %>%
    inner_join(lookup, by = "id") %>%
    mutate(total = n()) %>%
    group_by(term) %>%
    mutate(pct_papers = n() / total) %>%
    group_by(cluster) %>%
    mutate(cluster_total = n()) %>%
    group_by(cluster, term) %>%
    mutate(si = (n() / cluster_total) / pct_papers) %>%
    ungroup() %>%
    distinct(cluster, term, si) %>%
    arrange(desc(si)) %>%
    group_by(cluster) %>%
    slice_head(n = top_n) %>%
    summarize(term = paste(term, collapse = "; "))
}

# Most cited papers of each cluster, formatted as "title. (year ). journal".
# Returns one row per cluster, with the top_n papers pasted into `papers`.
top_cited_by_cluster <- function(nodes, cited_counts, top_n = 10) {
  nodes %>%
    inner_join(cited_counts, by = "id") %>%
    mutate(papers = str_c(title, ". (", pub_year, " )", ". ", journal)) %>%
    arrange(desc(cited_by_count)) %>%
    group_by(cluster) %>%
    slice_head(n = top_n) %>%
    summarize(papers = paste(papers, collapse = "; "))
}

## Cluster descriptions ----

# One description table per network; the list names become the sheet names
# of the workbook written below.
clusters <- vector("list", length(net))
names(clusters) <- names(net)

for (i in seq_along(net)) {
  nodes <- net[[i]]

  ### Component ----
  cluster_component <- nodes %>%
    distinct(cluster, component)

  ### Size ----
  cluster_size <- nodes %>%
    count(cluster)

  ### Number of core papers
  cluster_core_papers <- nodes %>%
    inner_join(core_ids, by = "id") %>%
    count(cluster, name = "n_core")

  ### Journals ----
  clusters_journals <- top_terms_by_cluster(nodes, journal_lookup) %>%
    rename(journal = term)

  ### keywords
  clusters_keywords <- top_terms_by_cluster(nodes, keyword_lookup) %>%
    rename(keyword = term)

  ### Most cited papers ----
  cluster_citations <- top_cited_by_cluster(nodes, citation_lookup)

  ### putting it all together----
  # Inner joins: a cluster missing from any of the tables above (for example
  # one without keywords) is left out of the description.
  clusters[[i]] <- cluster_component %>%
    inner_join(cluster_size, by = "cluster") %>%
    inner_join(cluster_core_papers, by = "cluster") %>%
    inner_join(clusters_journals, by = "cluster") %>%
    inner_join(clusters_keywords, by = "cluster") %>%
    inner_join(cluster_citations, by = "cluster") %>%
    rename("Number of publications" = n,
           "Number of core publications" = n_core,
           "Top journals" = journal,
           "Top keywords" = keyword,
           "Top cited papers" = papers)
  #   writexl::write_xlsx(str_c("data/networks_initial_set/cluster_table_net_",names(net)[i],".xlsx"))
}
writexl::write_xlsx(clusters, "../data/networks_initial_set/clusters_leiden.xlsx")

Expanded publications set

Network files
Code
citations <- dbReadTable(db, "citations")

expanded_net_dir <- "../data/networks/"

# Sum the weights of several network files edge by edge and write the result.
#   dir:   folder holding the net_<name>.csv files
#   parts: names of the networks to combine, e.g. c("bc", "cc")
#   out:   name of the combined network, e.g. "bc_cc"
# Edges are matched on the exact (source, target) pair, so a directed edge
# b -> a is not merged with an undirected edge stored as (a, b).
combine_networks <- function(dir, parts, out) {
  lapply(str_c(dir, "net_", parts, ".csv"), read_csv, show_col_types = FALSE) %>%
    bind_rows() %>%
    group_by(source, target) %>%
    summarize(weight = sum(weight), .groups = "drop") %>%
    write_csv(str_c(dir, "net_", out, ".csv"))
}

# -----------------------------
# Produce network files
# -----------------------------

# BC (bibliographic coupling): two citing works are linked once per cited
# work they share. The self-join is many-to-many by construction, which is
# declared explicitly.
citations %>%
  inner_join(citations, by = "item_id_cited",
             relationship = "many-to-many") %>%
  select(source = item_id_citing.x, target = item_id_citing.y) %>%
  filter(source < target) %>%  # keep each unordered pair once, no self-pairs
  count(source, target, name = "weight") %>%
  mutate(type = "undirected") %>%
  write_csv(str_c(expanded_net_dir, "net_bc.csv"))

# DC (direct citation): one directed edge of weight 1 per citation link.
citations %>%
  rename(source = item_id_citing, target = item_id_cited) %>%
  mutate(weight = 1,
         type = "directed") %>%
  unique() %>%
  write_csv(str_c(expanded_net_dir, "net_dc.csv"))

# CC (co-citation): two cited works are linked once per citing work that
# cites both.
citations %>%
  inner_join(citations, by = "item_id_citing",
             relationship = "many-to-many") %>%
  select(source = item_id_cited.x, target = item_id_cited.y) %>%
  filter(source < target) %>%
  count(source, target, name = "weight") %>%
  mutate(type = "undirected") %>%
  write_csv(str_c(expanded_net_dir, "net_cc.csv"))

# Combined networks, built from the three files written above
combine_networks(expanded_net_dir, c("bc", "cc", "dc"), "bc_cc_dc")
combine_networks(expanded_net_dir, c("bc", "cc"), "bc_cc")
combine_networks(expanded_net_dir, c("bc", "dc"), "bc_dc")
combine_networks(expanded_net_dir, c("cc", "dc"), "cc_dc")
Nodes
Code
## nodes files (with clusters) ----

# NOTE: everything below is commented out, so this chunk does nothing when the
# document is rendered. As written, it would loop over every net_*.csv file in
# ../data/networks/, load it as an undirected igraph graph, store each node's
# connected component plus its Louvain and Leiden cluster membership, and
# write the result to the matching nodes_*.csv file.
# NOTE(review): the clustering calls pass E(network)$Weight, while the edge
# files written by the network chunk name the column `weight` (lower case) --
# confirm which attribute is meant before re-enabling this code.

# net_files <- tibble(file = list.files("../data/networks/")) %>% 
#   filter(str_starts(file, "net_"))
# 
# 
# for(file in net_files$file) {
#   network <- graph_from_data_frame(read.csv(str_c("../data/networks/",file)), directed = F)
#   V(network)$comp <- components(network)$membership
#   #network <- induced_subgraph(network, V(network)$comp==1)
#   V(network)$cluster_louvain <- cluster_louvain(network, weights = c(E(network)$Weight))$membership
#   V(network)$cluster_leiden <- cluster_leiden(network, weights = c(E(network)$Weight))$membership
# #  V(network)$degree <- degree(network)
# #  V(network)$closeness <- closeness(network)
# #  V(network)$eigen_centrality <- eigen_centrality(network)$vector
#   select(as_data_frame(network, "both")$vertices, 
#                   id = name, 
#                   component = comp,
#                   cluster_louvain, 
#                   cluster_leiden) %>% 
#     write_csv(str_c("../data/networks/",str_replace(file, "net", "nodes")))
# }
Write to DB
Code
# Prepare the Louvain clusters of the expanded set for the database.
#
# `clusters` gets one row per (network, component, cluster) read from the
# sheets of clusters_louvain.xlsx, with ids that continue after the highest id
# already present in the `clusters` table. `works_clusters` maps every work in
# the nodes_*.csv files to the id of its cluster. The two dbWriteTable() calls
# are left commented out, as in the original; run them by hand to append.

# Network variants, in the order in which their rows are stacked (the order
# determines which id each cluster receives).
network_methods <- c("bc", "cc", "dc", "bc_cc", "bc_dc", "cc_dc", "bc_cc_dc")

# Highest cluster id already stored. max() over an empty table comes back as
# NA, which would turn every new id into NA, so fall back to 0 in that case.
max_id <- dbGetQuery(db, "select max(id) as max_id from clusters")$max_id
if (length(max_id) == 0 || is.na(max_id)) {
  max_id <- 0
}

clusters <- lapply(network_methods, function(method) {
  read_xlsx("../data/networks/clusters_louvain.xlsx",
            sheet = str_c("nodes_", method)) %>%
    mutate(network_method = method,
           clustering_method = "louvain") %>%
    select(network_method, clustering_method, component, cluster)
}) %>%
  bind_rows() %>%
  unique() %>%
  mutate(id = as.numeric(row_number()) + max_id, .before = 1) %>%
  mutate(set = "expanded")

# dbWriteTable(db, "clusters", clusters, row.names=F, append = T)

works_clusters <- lapply(network_methods, function(method) {
  read.csv(str_c("../data/networks/nodes_", method, ".csv")) %>%
    mutate(network_method = method) %>%
    select(work_id = id, network_method, component, cluster = cluster_louvain)
}) %>%
  bind_rows() %>%
  unique() %>%
  # Attach the cluster id; works whose cluster has no row in `clusters` are
  # dropped by the inner join.
  inner_join(clusters %>%
               select(id, network_method, component, cluster),
             by = c("network_method", "component", "cluster")) %>%
  select(work_id, cluster_id = id)

# dbWriteTable(db,"works_clusters", works_clusters, row.names = F, append = T)
Code
# Prepare the Leiden clusters of the expanded set for the database.
#
# `clusters` gets one row per (network, component, cluster) read from the
# sheets of clusters_leiden.xlsx, with ids that continue after the highest id
# already present in the `clusters` table. `works_clusters` maps every work in
# the nodes_*.csv files to the id of its cluster. The two dbWriteTable() calls
# are left commented out, as in the original; run them by hand to append.
# NOTE(review): the ids are derived from what is in the database at this
# moment, so any other clusters meant to precede these must already have been
# written -- confirm before appending.

# Network variants, in the order in which their rows are stacked (the order
# determines which id each cluster receives).
network_methods <- c("bc", "cc", "dc", "bc_cc", "bc_dc", "cc_dc", "bc_cc_dc")

# Highest cluster id already stored. max() over an empty table comes back as
# NA, which would turn every new id into NA, so fall back to 0 in that case.
max_id <- dbGetQuery(db, "select max(id) as max_id from clusters")$max_id
if (length(max_id) == 0 || is.na(max_id)) {
  max_id <- 0
}

clusters <- lapply(network_methods, function(method) {
  read_xlsx("../data/networks/clusters_leiden.xlsx",
            sheet = str_c("nodes_", method)) %>%
    mutate(network_method = method,
           clustering_method = "leiden") %>%
    select(network_method, clustering_method, component, cluster)
}) %>%
  bind_rows() %>%
  unique() %>%
  mutate(id = as.numeric(row_number()) + max_id, .before = 1) %>%
  mutate(set = "expanded")

# dbWriteTable(db, "clusters", clusters, row.names=F, append = T)

works_clusters <- lapply(network_methods, function(method) {
  read.csv(str_c("../data/networks/nodes_", method, ".csv")) %>%
    mutate(network_method = method) %>%
    select(work_id = id, network_method, component, cluster = cluster_leiden)
}) %>%
  bind_rows() %>%
  unique() %>%
  # Attach the cluster id; works whose cluster has no row in `clusters` are
  # dropped by the inner join.
  inner_join(clusters %>%
               select(id, network_method, component, cluster),
             by = c("network_method", "component", "cluster")) %>%
  select(work_id, cluster_id = id)

# dbWriteTable(db,"works_clusters", works_clusters, row.names = F, append = T)
Cluster descriptions (Louvain)
Code
## List of nodes files ----

node_files <- tibble(file = list.files("../data/networks/")) %>%
  filter(str_starts(file, "nodes_")) %>%
  mutate(file = str_c("../data/networks/", file))

# One tibble per nodes file, with the Louvain membership exposed as `cluster`.
net <- lapply(node_files$file, function(path) {
  read_csv(path, show_col_types = FALSE) %>%
    rename(cluster = cluster_louvain)
})
names(net) <- str_remove(str_remove(node_files$file, "../data/networks/"), "\\.csv")

## Inputs shared by every network ----
# These do not depend on the network, so they are fetched once instead of once
# per loop iteration.

# Core papers. The .RData file is loaded into a private environment so that
# the workspace copy of `open_science_init_set` (which the data-analysis chunk
# reads) is neither overwritten nor removed.
core_env <- new.env()
load("../data/open_science_init_set.RData", envir = core_env)
core_ids <- core_env$open_science_init_set %>%
  select(id = item_id)

works_journals <- dbGetQuery(db, "SELECT DISTINCT id, journal FROM works")

# Keywords are lemmatized up front; lemmatization works string by string, so
# doing it before the join gives the same rows as doing it afterwards.
works_keywords <- dbGetQuery(db,
                          "SELECT DISTINCT a.id, c.keyword 
                          FROM works a
                          JOIN works_keywords b on b.work_id = a.id
                          JOIN keywords c on c.id = b.keyword_id") %>%
  mutate(keyword = lemmatize_strings(keyword))

works_citations <- dbGetQuery(db,
                          "
                          SELECT DISTINCT 
                            a.id, 
                            a.title, 
                            a.pub_year, 
                            a.journal, 
                            count(distinct b.item_id_citing) as cited_by_count
                          FROM works a
                          JOIN citations b on b.item_id_cited = a.id
                          GROUP BY a.id, a.title, a.pub_year, a.journal")

## Helpers ----

# For each cluster, return its (at most) ten most specific terms collapsed
# into one "; "-separated string.
# `df` needs one row per (paper, term) with the columns `cluster` and `term`.
# The specificity index `si` is the share of a cluster's rows that carry the
# term divided by the share of all rows that carry it.
top_terms_by_specificity <- function(df) {
  df %>%
    mutate(total = n()) %>%
    group_by(term) %>%
    mutate(n_papers = n(),
           pct_papers = n_papers / total) %>%
    ungroup() %>%
    group_by(cluster) %>%
    mutate(cluster_total = n()) %>%
    ungroup() %>%
    group_by(cluster, term) %>%
    mutate(n_cluster = n(),
           pct_cluster = n_cluster / cluster_total,
           si = pct_cluster / pct_papers) %>%
    ungroup() %>%
    select(term, cluster, n_papers, pct_papers, n_cluster, cluster_total,
           pct_cluster, si) %>%
    unique() %>%
    group_by(cluster) %>%
    arrange(desc(si)) %>% # sorts the whole table; the rank below is per cluster
    mutate(rank = row_number()) %>%
    filter(rank <= 10) %>%
    select(cluster, term) %>%
    mutate(term = paste(term, collapse = "; ")) %>%
    unique() %>%
    ungroup()
}

# Build the description table of one network: one row per cluster with its
# component, size, number of core papers, top journals, top keywords and most
# cited papers. `nodes` is one element of `net`.
describe_clusters <- function(nodes) {

  ### Component ----
  cluster_component <- nodes %>%
    select(cluster, component) %>%
    unique()

  ### Size ----
  cluster_size <- nodes %>%
    group_by(cluster) %>%
    summarize(n = n(), .groups = "drop")

  ### Number of core papers ----
  cluster_core_papers <- nodes %>%
    inner_join(core_ids, by = "id") %>%
    group_by(cluster) %>%
    summarize(n_core = n(), .groups = "drop")

  ### Journals ----
  clusters_journals <- nodes %>%
    inner_join(works_journals, by = "id") %>%
    rename(term = journal) %>%
    top_terms_by_specificity() %>%
    rename(journal = term)

  ### Keywords ----
  clusters_keywords <- nodes %>%
    inner_join(works_keywords, by = "id") %>%
    rename(term = keyword) %>%
    top_terms_by_specificity() %>%
    rename(keyword = term)

  ### Most cited papers ----
  cluster_citations <- nodes %>%
    inner_join(works_citations, by = "id") %>%
    mutate(papers = str_c(title, ". (", pub_year, " )", ". ", journal)) %>%
    group_by(cluster) %>%
    arrange(desc(cited_by_count)) %>%
    mutate(rank = row_number()) %>%
    filter(rank <= 10) %>%
    select(cluster, papers) %>%
    mutate(papers = paste(papers, collapse = "; ")) %>%
    unique() %>%
    ungroup()

  ### Putting it all together ----
  # Inner joins: a cluster missing from any of the tables above (for example
  # one without core papers) does not appear in the result.
  cluster_component %>%
    inner_join(cluster_size, by = "cluster") %>%
    inner_join(cluster_core_papers, by = "cluster") %>%
    inner_join(clusters_journals, by = "cluster") %>%
    inner_join(clusters_keywords, by = "cluster") %>%
    inner_join(cluster_citations, by = "cluster") %>%
    rename("Number of publications" = n,
           "Number of core publications" = n_core,
           "Top journals" = journal,
           "Top keywords" = keyword,
           "Top cited papers" = papers)
}

## Cluster descriptions ----

# lapply() keeps the names of `net`, which become the sheet names of the
# workbook.
clusters <- lapply(net, describe_clusters)
writexl::write_xlsx(clusters, "../data/networks/clusters_louvain.xlsx")
Cluster descriptions (Leiden)
Code
## List of nodes files ----

node_files <- tibble(file = list.files("../data/networks/")) %>%
  filter(str_starts(file, "nodes_")) %>%
  mutate(file = str_c("../data/networks/", file))

# One tibble per nodes file, with the Leiden membership exposed as `cluster`.
net <- lapply(node_files$file, function(path) {
  read_csv(path, show_col_types = FALSE) %>%
    rename(cluster = cluster_leiden)
})
names(net) <- str_remove(str_remove(node_files$file, "../data/networks/"), "\\.csv")

## Inputs shared by every network ----
# These do not depend on the network, so they are fetched once instead of once
# per loop iteration.

# Core papers. The .RData file is loaded into a private environment so that
# the workspace copy of `open_science_init_set` (which the data-analysis chunk
# reads) is neither overwritten nor removed.
core_env <- new.env()
load("../data/open_science_init_set.RData", envir = core_env)
core_ids <- core_env$open_science_init_set %>%
  select(id = item_id)

works_journals <- dbGetQuery(db, "SELECT DISTINCT id, journal FROM works")

# Keywords are lemmatized up front; lemmatization works string by string, so
# doing it before the join gives the same rows as doing it afterwards.
works_keywords <- dbGetQuery(db,
                          "SELECT DISTINCT a.id, c.keyword 
                          FROM works a
                          JOIN works_keywords b on b.work_id = a.id
                          JOIN keywords c on c.id = b.keyword_id") %>%
  mutate(keyword = lemmatize_strings(keyword))

works_citations <- dbGetQuery(db,
                          "
                          SELECT DISTINCT 
                            a.id, 
                            a.title, 
                            a.pub_year, 
                            a.journal, 
                            count(distinct b.item_id_citing) as cited_by_count
                          FROM works a
                          JOIN citations b on b.item_id_cited = a.id
                          GROUP BY a.id, a.title, a.pub_year, a.journal")

## Helpers ----

# For each cluster, return its (at most) ten most specific terms collapsed
# into one "; "-separated string.
# `df` needs one row per (paper, term) with the columns `cluster` and `term`.
# The specificity index `si` is the share of a cluster's rows that carry the
# term divided by the share of all rows that carry it.
top_terms_by_specificity <- function(df) {
  df %>%
    mutate(total = n()) %>%
    group_by(term) %>%
    mutate(n_papers = n(),
           pct_papers = n_papers / total) %>%
    ungroup() %>%
    group_by(cluster) %>%
    mutate(cluster_total = n()) %>%
    ungroup() %>%
    group_by(cluster, term) %>%
    mutate(n_cluster = n(),
           pct_cluster = n_cluster / cluster_total,
           si = pct_cluster / pct_papers) %>%
    ungroup() %>%
    select(term, cluster, n_papers, pct_papers, n_cluster, cluster_total,
           pct_cluster, si) %>%
    unique() %>%
    group_by(cluster) %>%
    arrange(desc(si)) %>% # sorts the whole table; the rank below is per cluster
    mutate(rank = row_number()) %>%
    filter(rank <= 10) %>%
    select(cluster, term) %>%
    mutate(term = paste(term, collapse = "; ")) %>%
    unique() %>%
    ungroup()
}

# Build the description table of one network: one row per cluster with its
# component, size, number of core papers, top journals, top keywords and most
# cited papers. `nodes` is one element of `net`.
describe_clusters <- function(nodes) {

  ### Component ----
  cluster_component <- nodes %>%
    select(cluster, component) %>%
    unique()

  ### Size ----
  cluster_size <- nodes %>%
    group_by(cluster) %>%
    summarize(n = n(), .groups = "drop")

  ### Number of core papers ----
  cluster_core_papers <- nodes %>%
    inner_join(core_ids, by = "id") %>%
    group_by(cluster) %>%
    summarize(n_core = n(), .groups = "drop")

  ### Journals ----
  clusters_journals <- nodes %>%
    inner_join(works_journals, by = "id") %>%
    rename(term = journal) %>%
    top_terms_by_specificity() %>%
    rename(journal = term)

  ### Keywords ----
  clusters_keywords <- nodes %>%
    inner_join(works_keywords, by = "id") %>%
    rename(term = keyword) %>%
    top_terms_by_specificity() %>%
    rename(keyword = term)

  ### Most cited papers ----
  cluster_citations <- nodes %>%
    inner_join(works_citations, by = "id") %>%
    mutate(papers = str_c(title, ". (", pub_year, " )", ". ", journal)) %>%
    group_by(cluster) %>%
    arrange(desc(cited_by_count)) %>%
    mutate(rank = row_number()) %>%
    filter(rank <= 10) %>%
    select(cluster, papers) %>%
    mutate(papers = paste(papers, collapse = "; ")) %>%
    unique() %>%
    ungroup()

  ### Putting it all together ----
  # Inner joins: a cluster missing from any of the tables above (for example
  # one without core papers) does not appear in the result.
  cluster_component %>%
    inner_join(cluster_size, by = "cluster") %>%
    inner_join(cluster_core_papers, by = "cluster") %>%
    inner_join(clusters_journals, by = "cluster") %>%
    inner_join(clusters_keywords, by = "cluster") %>%
    inner_join(cluster_citations, by = "cluster") %>%
    rename("Number of publications" = n,
           "Number of core publications" = n_core,
           "Top journals" = journal,
           "Top keywords" = keyword,
           "Top cited papers" = papers)
}

## Cluster descriptions ----

# lapply() keeps the names of `net`, which become the sheet names of the
# workbook.
clusters <- lapply(net, describe_clusters)
writexl::write_xlsx(clusters, "../data/networks/clusters_leiden.xlsx")

Data analysis

Code
# Data analysis: distribution of works, and of core works, over the Louvain
# clusters of the expanded set, per network method.

# New graph -----

# The `core` flag below needs the initial publication set. Reload it if it is
# no longer in the workspace (the cluster-description loops remove it).
if (!exists("open_science_init_set")) {
  load("../data/open_science_init_set.RData")
}

works_clusters <- dbReadTable(db, "works_clusters")
clusters <- dbReadTable(db, "clusters")
works <- dbGetQuery(db, "select distinct id from works")

# Rank of every Louvain cluster of the expanded set within its network,
# largest cluster first. rank() assigns tied sizes their average rank.
# The original also computed `n_core = sum(core)` here, but `core` is only
# created further down in `data` and `n_core` was dropped before the result
# was used, so that step is omitted.
clusters_rk <- clusters %>%
  filter(set == "expanded", clustering_method == "louvain") %>%
  inner_join(works_clusters, by = c("id" = "cluster_id")) %>%
  count(cluster_id = id, network_method, name = "size") %>%
  group_by(network_method) %>%
  mutate(rk = rank(desc(size))) %>%
  ungroup() %>%
  select(cluster_id, rk)

# One row per (work, cluster) for the ranked clusters, flagged as core when
# the work belongs to the initial publication set.
data <- works_clusters %>%
  as_tibble() %>%
  inner_join(clusters, by = c("cluster_id" = "id")) %>%
  inner_join(works, by = c("work_id" = "id")) %>%
  inner_join(clusters_rk, by = "cluster_id") %>%
  mutate(core = work_id %in% open_science_init_set$item_id)

# Number of core / non-core rows per network method, with the method total.
data %>%
  group_by(network_method, core) %>%
  summarize(n = n()) %>%
  ungroup() %>%
  group_by(network_method) %>%
  mutate(total = sum(n)) %>%
  ungroup()

# Works per cluster rank, split by core status.
data %>%
  ggplot() +
  aes(rk, fill = core) +
  geom_bar() +
  facet_wrap(facets = "network_method")

# Core works only.
data %>%
  filter(core) %>%
  ggplot() +
  aes(rk) +
  geom_bar() +
  facet_wrap(facets = "network_method")

# NOTE(review): the plot below uses a `cluster_number` column; the cluster
# tables assembled in this document name the column `cluster` -- confirm that
# `cluster_number` exists in the database table.
clusters_overall2 <- dbGetQuery(db, "select * from clusters c 
join works_clusters wc on wc.cluster_id = c.id" )

ggplot(clusters_overall2) +
  aes(cluster_number) +
  geom_histogram() +
  facet_wrap(facets = "network_method")

Keyword frequency distribution

Prepare data

Code
# Number of distinct works attached to each keyword, most frequent first,
# exported as a spreadsheet.
keywords_freq_sql <- "SELECT DISTINCT c.keyword, COUNT(DISTINCT a.id) as n
FROM works a
JOIN works_keywords b on b.work_id = a.id
JOIN keywords c on c.id = b.keyword_id
GROUP BY c.keyword
ORDER BY n DESC"

keywords_freq <- dbGetQuery(db, keywords_freq_sql)

writexl::write_xlsx(x = keywords_freq, path = "../data/keywords_freq.xlsx")

Results

Discussion and Conclusion.

Acknowledgments

References

Appendix - Cluster details

Warning

The tables below do not work well in dark mode, switch to light mode to explore the data.

The methods available are: BC, BC-CC, BC-CC-DC, BC-DC, CC, CC-DC, and DC, with the clustering method that was used appended as a suffix (-Louvain or -Leiden). For example, BC-CC-Louvain refers to the clusters identified with the Louvain community detection algorithm in the bibliographic coupling + co-citation network.

Initial publications set

All publications

References

Klavans, R., & Boyack, K. W. (2017). Which type of citation analysis generates the most accurate taxonomy of scientific and technical knowledge? Journal of the Association for Information Science and Technology, 68(4), 984–998. https://doi.org/10.1002/asi.23734